from functools import reduce
import phpserialize
import time
import requests
import json
import random
import logging
from logging.handlers import RotatingFileHandler
import os
from lxml import etree
from fake_useragent import UserAgent


# Module-level logging: write to a rotating file and mirror everything to the console.
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
# Rotating file handler: rotate when the file reaches 10 KB, keep up to 5 backups.
rHandler = RotatingFileHandler("sgwx_log.txt", maxBytes=10 * 1024, backupCount=5)
rHandler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
rHandler.setFormatter(formatter)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(formatter)
logger.addHandler(rHandler)
logger.addHandler(console)

# yq: channel/source identifier stamped onto every scraped record
# (exact meaning defined by the downstream schema — TODO confirm).
yq = 5
# One random desktop User-Agent string, chosen once at import time.
ua = UserAgent().random
# Base request headers for weixin.sogou.com. The hard-coded Cookie below is a
# stale sample; get_req() overwrites "Cookie" (with a fresh SNUID) and
# "User-Agent" before any real request is made.
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Cookie": "ABTEST=2|1561358499|v1; IPLOC=CN1100; SUID=1A4FC17C4631990A000000005D1070A3; SUV=00CDEE0D7CC14F1A5D1070A35DFF7862; SUID=1A4FC17C2113940A000000005D1070A3; weixinIndexVisited=1; pgv_pvi=2997798912; ld=tZllllllll2N9tTHlllllV1S8k7lllllKGV5pklllltlllllpylll5@@@@@@@@@@; LSTMV=682%2C416; LCLKINT=8742; SNUID=F7A22C91EDE961A343DEFB22EDD4F97E; sct=14; JSESSIONID=aaaoA1hhCs7Mf8ji7bsTw",
    "Host": "weixin.sogou.com",
    "Pragma": "no-cache",
    "Referer": "https://weixin.sogou.com/weixin?usip=&query=%E9%98%BF%E5%B0%94%E6%B3%95%E7%8B%97&ft=&tsn=1&et=&interation=&type=2&wxid=&page=2&ie=utf8",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}


def detecting_keywords():
    """
    Fetch the pending keyword rows for this channel and keep only those whose
    cached page payload contains a "wx_data" marker; rows without it are
    flagged as finished via update_flag.

    :return: list of [keyword, pid] pairs that still need collection
    """
    pending = get_keyword(yq)
    if not pending:
        return []
    results = []
    # Process rows in ascending order of their first column.
    for row in sorted(pending, key=lambda item: item[0]):
        keyword, pid, page = row[1], row[2], row[4]
        if '"wx_data"' in page:
            results.append([keyword, pid])
        else:
            # No usable WeChat data for this keyword — mark it done.
            update_flag(keyword, pid, yq)
    return results


def random_steep():
    """
    Sleep for a random 2-8 second interval to make the request pattern look
    less bot-like (anti-ban throttle). Name kept for callers; note the
    "steep"/"sleep" typo is part of the public interface.
    :return: None
    """
    pause = random.randint(2, 8)
    logger.info("暂停{}秒".format(pause))
    time.sleep(pause)


def get_cookies():
    """
    Issue a plain HTTP GET to a Sogou video page to be handed fresh cookies,
    then persist them as JSON to wx_cookie.json (read later by get_req).

    NOTE: despite the original docstring, no selenium is involved — a simple
    un-redirected GET is enough for Sogou to set cookies.
    :return: None
    """
    url = 'https://v.sogou.com/v?ie=utf8&query=&p=40030600'
    headers = {'User-Agent': ua}
    rst = requests.get(url=url, headers=headers, allow_redirects=False)
    cookies = rst.cookies.get_dict()
    # "w" truncates any stale cookie file; the context manager closes the
    # handle, so the original's explicit f.close() was redundant.
    with open("wx_cookie.json", "w") as f:
        f.write(json.dumps(cookies))
    logger.info("wx_cookies获取成功")


def timestamp(dt):
    """
    Convert a "%Y-%m-%d %H:%M:%S" datetime string into a Unix timestamp.

    The string is interpreted in the machine's local timezone (time.mktime).

    :param dt: datetime string, e.g. "2020-01-01 12:00:00"
    :return: seconds since the epoch as a float
    """
    parsed = time.strptime(dt, "%Y-%m-%d %H:%M:%S")
    return time.mktime(parsed)


def yesterday():
    """
    Return the Unix timestamp for exactly 24 hours before now.

    :return: int seconds since the epoch, current time minus 86400
    """
    return int(time.time()) - 24 * 60 * 60


def get_req():
    """
    Build a requests.Session carrying the cached SNUID cookie.

    Reads wx_cookie.json (written by get_cookies), splices its SNUID value
    into the hard-coded cookie template, and installs the result — plus the
    module's random User-Agent — into the shared header dict, which becomes
    the session's headers.
    :return: a configured requests.Session
    """
    logger.info("打开cookie文件获取cookies")
    with open("wx_cookie.json", "r") as f:
        cached = json.loads(f.readline())
    template = "ABTEST=2|1561358499|v1; IPLOC=CN1100; SUID=1A4FC17C4631990A000000005D1070A3; SUV=00CDEE0D7CC14F1A5D1070A35DFF7862; SUID=1A4FC17C2113940A000000005D1070A3; weixinIndexVisited=1; pgv_pvi=2997798912; ld=tZllllllll2N9tTHlllllV1S8k7lllllKGV5pklllltlllllpylll5@@@@@@@@@@; LSTMV=682%2C416; LCLKINT=8742; SNUID={}; sct=14; JSESSIONID=aaaoA1hhCs7Mf8ji7bsTw"
    header["Cookie"] = template.format(cached["SNUID"])
    header["User-Agent"] = ua
    session = requests.Session()
    session.headers = header
    return session


def cookies_expried():
    """
    Return a session whose cookies are known to work, refreshing them when a
    probe search request is redirected (302 means anti-bot / expired).

    Name kept for callers; note the "expried"/"expired" typo is part of the
    public interface.
    :return: a ready-to-use requests.Session
    """
    if not os.path.isfile("wx_cookie.json"):
        # First run: no cookie file yet, fetch one before probing.
        get_cookies()
    req = get_req()
    url = "https://weixin.sogou.com/weixin?usip=&query=%E9%98%BF%E5%B0%94%E6%B3%95%E7%8B%97&ft=&tsn=1&et=&interation=&type=2&wxid=&page=3&ie=utf8"
    response = req.get(url, allow_redirects=False)
    logger.info("验证cookies是否可用")
    if response.status_code != 302:
        logger.info("cookies可用")
        return req
    # Probe was redirected — cookies are stale; fetch new ones and rebuild.
    logger.info("cookies不可用，重新获取cookies")
    get_cookies()
    return get_req()


def into_database(items):
    """Persist scraped items through the project's data pipeline.

    NOTE(review): ``DataPipeline`` is neither imported nor defined in this
    file — as written this raises NameError at call time; presumably an
    import is missing. Verify against the rest of the project.

    :param items: list of item dicts assembled in main()
    """
    pipeline = DataPipeline()
    pipeline.process_item(items=items)


def list_dict_duplicate_removal(data_list):
    """
    Remove duplicate entries from a list while preserving first-occurrence
    order.

    Dicts are unhashable, so a set cannot be used; membership is checked by
    equality. This explicit loop replaces the original reduce + lambda
    one-liner (same O(n^2) behavior, far clearer, and no lambda bound to a
    name — PEP 8 E731).

    :param data_list: list of (possibly duplicated) items, typically dicts
    :return: a new list with duplicates removed
    """
    unique = []
    for entry in data_list:
        if entry not in unique:
            unique.append(entry)
    return unique


def main():
    """Endless collection loop: poll for pending keywords, scrape up to 10
    Sogou-WeChat search result pages per keyword, and store parsed items."""
    while True:
        keywords = detecting_keywords()
        if keywords == []:
            # Nothing pending — back off for a minute before polling again.
            logger.info("本次无需采集信息 暂停1m")
            time.sleep(60)
            continue
        for key in keywords:
            keyword = key[0]
            pid = key[1]
            logger.info("当前关键词是：{}".format(keyword))
            start_url = "https://weixin.sogou.com/weixin?usip=&query={}&ft=&tsn=1&et=&interation=&type=2&wxid=&page={}&ie=utf8"
            req = cookies_expried()
            # Walk result pages 1..10 for the current keyword.
            for i in range(1, 11):
                random_steep()
                # NOTE(review): get_proxy is not defined or imported in this
                # file — confirm it exists elsewhere in the project.
                proxies = {
                    "http": "http://" + get_proxy()
                }
                response = req.get(url=start_url.format(keyword, i), allow_redirects=False, proxies=proxies)
                if response.status_code == 302:
                    # 302 means a captcha redirect; execution continues anyway
                    # — the parse below will simply find 0 entries.
                    print("验证码 需要跟换ip")
                rex = etree.HTML(response.text)
                infos = rex.xpath('//ul[@class="news-list"]/li')  # result entries on this page
                logger.info("本页信息条数：{}".format(len(infos)))
                if len(infos) == 0:
                    # Nothing parsed: flag this keyword as finished, next keyword.
                    update_flag(keyword, pid, yq)
                    break
                rank = 0
                save_data = []
                for data in infos:
                    rank += 1
                    # Absolute rank across pages, 10 results per page.
                    ranks = (int(i) - 1) * 10 + rank
                    link = data.xpath('string(./div[@class="txt-box"]/h3/a/@data-share)')
                    _title = data.xpath('string(./div[@class="txt-box"]/h3/a)')
                    summary = data.xpath('string(./div[@class="txt-box"]/p)')
                    screen_name = data.xpath('string(./div[@class="txt-box"]/div[@class="s-p"]/a)')
                    timestamps = data.xpath('string(./div[@class="txt-box"]/div[@class="s-p"]/@t)')
                    tup = (pid, keyword, timestamps, _title, 1, yq)
                    # NOTE(review): this rebinds the loop variable `data`
                    # (the lxml element) to the serialized bytes — harmless
                    # here because the element is no longer needed, but easy
                    # to trip over when editing.
                    data = phpserialize.dumps(tup)
                    # NOTE(review): hash() on bytes is salted per process
                    # (PYTHONHASHSEED), so this guid is NOT stable across
                    # runs — a stable digest (e.g. hashlib.md5) is likely
                    # intended if guid is used for deduplication.
                    guid = hash(data)
                    item = dict()
                    item['mid'] = 0
                    item['guid'] = guid
                    item['short_url'] = link
                    item['rank_old'] = 0
                    item['yq'] = yq
                    item['nr'] = 1
                    item['sh'] = 0
                    item['qr'] = 0
                    item['cjtype'] = 1
                    item['update_time'] = int(time.time())
                    item['del_time'] = 0
                    item['status'] = 1
                    item['isnew_email'] = 0
                    item['ftype'] = 1
                    item['isnew_wx'] = 0
                    item['froms'] = 0
                    item['zf'] = 1
                    item['title'] = _title
                    item['url'] = link
                    item['screen_name'] = screen_name
                    item['content'] = summary
                    item['screen_time'] = timestamps
                    item['dzs'] = 0
                    item['zfs'] = 0
                    item['pls'] = 0
                    item['keyword'] = keyword
                    item['rank'] = ranks
                    item['page'] = i
                    item['pid'] = pid
                    # NOTE(review): duplicate assignment — screen_time was
                    # already set above.
                    item['screen_time'] = timestamps
                    item['is_white'] = 1
                    save_data.append(item)
                save_datas = list_dict_duplicate_removal(save_data)
                into_database(save_datas)
                next_page = rex.xpath('//a[@id="sogou_next"]/@href')
                if len(next_page) == 0:
                    # No "next page" link: this keyword is exhausted.
                    logger.info("已完成 当前第{}页".format(i + 1))
                    update_flag(keyword, pid, yq)
                    break
                # NOTE(review): dead branch — i never exceeds 10 because
                # range(1, 11) stops at 10, so this break is unreachable.
                if int(i) > 10:
                    logger.info("已完成 当前第{}页".format(i + 1))
                    update_flag(keyword, pid, yq)
                    break
                logger.info("翻页 当前第{}页".format(i + 1))


# Script entry point: run the scraping loop forever (main never returns).
if __name__ == '__main__':
    main()